# # Load csvs
# datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
# datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
# 
# # Make sure datExpr and datMeta columns/rows match
# rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
# if(!all(colnames(datExpr) == rownames(datMeta))){
#   print('Columns in datExpr don\'t match the rowd in datMeta!')
# }
# 
# # Annotate probes
# getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
#             'end_position','strand','band','gene_biotype','percentage_gc_content')
# mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
#                dataset='hsapiens_gene_ensembl',
#                host='feb2014.archive.ensembl.org') ## Gencode v19
# datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
# datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
# datProbes$length = datProbes$end_position-datProbes$start_position
# 
# # Group brain regions by lobes
# datMeta$Brain_Region = as.factor(datMeta$Region)
# datMeta$Brain_lobe = 'Occipital'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
# datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
# datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
# 
# #################################################################################
# # FILTERS:
# 
# # 1 Filter probes with start or end position missing (filter 5)
# # These can be filtered without probe info, they have weird IDS that don't start with ENS
# to_keep = !is.na(datProbes$length)
# datProbes = datProbes[to_keep,]
# datExpr = datExpr[to_keep,]
# rownames(datProbes) = datProbes$ensembl_gene_id
# 
# # 2. Filter samples from ID AN03345 (filter 2)
# to_keep = (datMeta$Subject_ID != 'AN03345')
# datMeta = datMeta[to_keep,]
# datExpr = datExpr[,to_keep]
# 
# # 3. Filter samples with rowSums <= 40
# to_keep = rowSums(datExpr)>40
# datExpr = datExpr[to_keep,]
# datProbes = datProbes[to_keep,]
# 
# if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2.csv')){
#   counts = as.matrix(datExpr)
#   rowRanges = GRanges(datProbes$chromosome_name,
#                       IRanges(datProbes$start_position, width=datProbes$length),
#                       strand=datProbes$strand,
#                       feature_id=datProbes$ensembl_gene_id)
#   
#   se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
#   ddsSE = DESeqDataSet(se, design =~Diagnosis_)
#   
#   dds = DESeq(ddsSE)
#   DE_info = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
#                    mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>% 
#                    dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
#   
#   write.csv(DE_info_DESeq2, './../working_data/genes_ASD_DE_info_DESeq2.csv', row.names = FALSE)
#   
#   rm(counts, rowRanges, se, ddsSE, dds, mart)
#   
# } else DE_info = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv')
# 
# save(file='./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata', datExpr, datMeta, datProbes, DE_info)

load('./../working_data/RNAseq_ASD_4region_DEgenes_vst_DESeq2.Rdata')

# Scale gene expression values
# datExpr = datExpr %>% t %>% scale %>% t %>% data.frame

#################################################################################
# SFARI Genes
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
## Parsed with column specification:
## cols(
##   status = col_double(),
##   `gene-symbol` = col_character(),
##   `gene-name` = col_character(),
##   chromosome = col_character(),
##   `genetic-category` = col_character(),
##   `gene-score` = col_double(),
##   syndromic = col_double(),
##   `number-of-reports` = col_double()
## )
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                   mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>% 
                   dplyr::select('ID', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')


#################################################################################
# Add functional annotation to genes from GO

GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID' = as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal' = 1)

# Add SFARI and GO info to DE_info
genes_DE_info = DE_info %>% dplyr::select(-X) %>% left_join(SFARI_genes, by='ID') %>% 
                mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`),
                        syndromic=ifelse(is.na(syndromic), 0, syndromic),
                        logFC=logFC_DESeq2, adj.P.Val=adj.P.Val_DESeq2) %>%
                dplyr::select(ID, logFC, adj.P.Val, `gene-symbol`, `gene-score`,syndromic) %>%
                mutate(`gene-score`=ifelse(`gene-score`=='None' & ID %in% GO_neuronal$ID, 'Neuronal', `gene-score`))


datExpr_backup = datExpr

# Filter DE genes
# datExpr = datExpr[DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2),]

rm(mart, gene_names, GO_annotations, DE_info)

Number of genes:

nrow(datExpr)
## [1] 33478

Gene count by SFARI score:

table(SFARI_genes$`gene-score`)
## 
##   1   2   3   4   5   6 
##  29  82 209 538 191  25

Gene count by brain lobe:

table(datMeta$Brain_lobe)
## 
##   Frontal  Temporal  Parietal Occipital 
##        21        20        22        23

Boxplot of logFC by score

Genes with scores 2 and 3 have slightly higher quartiles than genes without a score

ggplotly(genes_DE_info %>% ggplot(aes(`gene-score`, abs(logFC), fill=`gene-score`)) + 
         geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) + theme_minimal())

Changes in PCA plots for different filtering thresholds

lfc=-1 means no filtering at all, the rest of the filterings include on top of the defined lfc, an adjusted p-value lower than 0.05

lfc_list = seq(0, 2, 0.1)

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = log2(datExpr+1) %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = log2(datExpr+1) %>% data.frame %>% prcomp(scale.=TRUE)

# Initialice DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'lfc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(lfc in lfc_list){
  
  # Filter DE genes with iteration's criteria
  DE_genes = genes_DE_info %>% filter(adj.P.Val<0.05 & abs(logFC)>lfc)
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = log2(datExpr_DE+1) %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = log2(datExpr_DE+1) %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'lfc'=lfc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new$PC1, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1 )<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new$PC2, pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>% 
             left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
             dplyr::select(ID, PC1, PC2, lfc, Diagnosis_, Brain_lobe)
pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
             mutate('score'=as.factor(`gene-score`)) %>%
             dplyr::select(ID, PC1, PC2, lfc, score)

# Plot change of number of genes
ggplotly(data.frame('lfc'=lfc_list, 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + 
         ggtitle('Number of remaining genes when modifying filtering threshold'))
rm(datExpr_pca_genes, datExpr_pca_samps, DE_genes, datExpr_DE, pca_genes_new, pca_samps_new, 
   pca_genes_old, pca_samps_old, lfc_list, lfc)

Samples

Note: PC values get smaller as Log2 fold change increases, so on each iteration the values were scaled so it would be easier to compare between frames

Coloured by Diagnosis:

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))

Coloured by brain region:

No recognisable pattern

ggplotly(pcas_samps %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))

Genes

SFARI genes coloured by score

pcas_sfari_genes = pcas_genes %>% filter(!is.na(score)) %>% dplyr::select(-'score')

complete_sfari_df = expand.grid(unique(pcas_sfari_genes$ID), unique(pcas_sfari_genes$lfc))
colnames(complete_sfari_df) = c('ID', 'lfc')

pcas_sfari_genes = pcas_sfari_genes %>% right_join(complete_sfari_df, by=c('ID','lfc')) %>% 
                   left_join(SFARI_genes, by='ID') %>% 
                   mutate('score'=as.factor(`gene-score`), 'syndromic'=as.factor(syndromic))
pcas_sfari_genes[is.na(pcas_sfari_genes)] = 0 # Fix for ghost points
  
ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=score)) + 
         geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() + 
         ggtitle('Genes PCA plot modifying filtering threshold'))

Most of the genes get filtered out by the first adjusted p-value<0.05 filter (including all the genes with score=1 except two), but the proportion of genes left after the first cut is significantly higher for all scores and they seem to be generally filtered out after as well

table(SFARI_genes$`gene-score`[SFARI_genes$ID %in% genes_DE_info$ID[genes_DE_info$adj.P.Val<0.05]])
## 
##   1   2   3   4   5   6 
##   2  14  67 154  61  11
pcas_genes = pcas_genes %>% mutate(score=ifelse(is.na(score) & ID %in% GO_neuronal$ID, 'Neuronal', score))

# Calculate percentage of genes remaining on each lfc by each score
score_count_by_lfc = pcas_genes %>% filter(!is.na(score)) %>% group_by(lfc, score) %>% tally %>% ungroup
score_count_pcnt = score_count_by_lfc %>% filter(lfc==-1) %>% mutate('n_init'=n) %>%
                   dplyr::select(score, n_init) %>% right_join(score_count_by_lfc, by='score') %>%
                   mutate('pcnt'=round(n/n_init*100, 2)) %>% filter(lfc!=-1)

# Complete missing entries with zeros
complete_score_count_pcnt = expand.grid(unique(score_count_pcnt$lfc), unique(score_count_pcnt$score))
colnames(complete_score_count_pcnt) = c('lfc', 'score')
score_count_pcnt = full_join(score_count_pcnt, complete_score_count_pcnt, by=c('lfc','score')) %>%
                   dplyr::select(score, lfc, n, pcnt)
score_count_pcnt[is.na(score_count_pcnt)] = 0

# Join counts by score and all genes
all_count_pcnt = pcas_genes %>% group_by(lfc) %>% tally  %>% filter(lfc!=-1) %>% 
                 mutate('pcnt'=round(n/nrow(datExpr)*100, 2), 'score'='All')
score_count_pcnt = rbind(score_count_pcnt, all_count_pcnt)

ggplotly(score_count_pcnt %>% ggplot(aes(lfc, pcnt, color=score)) + geom_point() + geom_line() + 
         scale_colour_manual(palette=gg_colour_hue) + theme_minimal() + 
         ggtitle('% of points left after each increase in log2 fold change'))
rm(score_count_by_lfc, complete_score_count_pcnt)

SFARI genes coloured by syndromic tag

Most syndromic genes get filtered out with the p-value threshold, the remaining ones don’t seem to have a very different behaviour to the rest of the data, perhaps they survive a bit longer.

ggplotly(pcas_sfari_genes %>% ggplot(aes(PC1, PC2, color=ordered(syndromic, levels=c(1,0)))) + 
         geom_point(aes(frame=lfc, ids=ID), alpha=0.6) + theme_minimal() + 
         scale_colour_manual(palette=gg_colour_hue) +
         ggtitle('Genes PCA plot modifying filtering threshold'))
# Calculate percentage of syndromic genes remaining on each lfc
syndromic_count_by_lfc = pcas_sfari_genes %>% filter(syndromic==1 & PC1!=0) %>% group_by(lfc) %>% tally %>% 
                         ungroup %>% filter(lfc!=-1) %>% 
                         mutate('pcnt' = round(n/nrow(SFARI_genes[SFARI_genes$syndromic==1,])*100,2), 'score'='syndromic')

# Complete missing entires with zeros and add stats for all genes for comparison
syndromic_count_by_lfc = data.frame('lfc' = unique(pcas_genes$lfc), 'score'='syndromic') %>% filter(lfc!=-1) %>% 
                         full_join(syndromic_count_by_lfc, by=c('lfc','score')) %>% replace(.,is.na(.),0) %>%
                         rbind(all_count_pcnt) %>% mutate('score'=ordered(score, levels=c('syndromic','All')))


ggplotly(syndromic_count_by_lfc %>% ggplot(aes(lfc, pcnt, color=score)) + geom_point() + geom_line() + 
         scale_colour_manual(palette=gg_colour_hue) + theme_minimal() + 
         ggtitle('% of points left after each increase in log2 fold change'))

All genes together

ggplotly(pcas_genes %>% ggplot(aes(PC1, PC2)) + geom_point(aes(frame=lfc, ids=ID, alpha=0.3)) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying filtering threshold'))